This workflow reanalyze the single nucleus RNA-seq data produced by (Habib et al. 2017), using DroNc-seq: massively parallel sNuc-seq with droplet technology.
We will first load a few libraries. Among them, * DropletUtils provides functions for data from droplet technologies such as 10X Genomics. * biomaRt provides easy access to databases, such as Ensembl, COSMIC, Uniprot, HGNC, etc. * scater is a collection of tools for doing quality control analyses of scRNA-seq * scran provide functions for normalization of cell-specific libary sizes, correcting batch effects, and identification marker genes
bio_packs = c("SingleCellExperiment","DropletUtils","biomaRt",
"scater","scran","limma","org.Hs.eg.db","SC3")
bio_packs = bio_packs[! bio_packs %in% installed.packages()[,"Package"]]
if( length(bio_packs) > 0 ){
source("https://bioconductor.org/biocLite.R")
biocLite(bio_packs, suppressUpdates = TRUE)
}
cran_packs = c("stringi","irlba")
cran_packs = cran_packs[! cran_packs %in% installed.packages()[,"Package"]]
if( length(cran_packs) > 0 ){
install.packages(cran_packs)
}
library(data.table)
library(SingleCellExperiment)
library(DropletUtils)
library(biomaRt)
library(scater)
library(scran)
library(limma)
library(ggplot2)
src_dir = "~/research/GitHub/scRNAseq_pipelines/dronc"
data_dir = "~/research/scRNAseq/workflow/data"
dronc_dir = "~/research/scRNAseq/data/GTEx_droncseq_hip_pcf"Next we read in the count data. The dataset is available here.
fname = file.path(dronc_dir, "GTEx_droncseq_hip_pcf.umi_counts.txt.gz")
counts = fread(paste('zcat <', fname))
dim(counts)## [1] 32111 14964
counts[1:3,1:2]## V1 hHP1_AACACTATCTAC
## 1: A1BG 0
## 2: A1BG-AS1 0
## 3: A1CF 0
counts.mx = as.matrix(counts[,-1])
rownames(counts.mx) = counts$V1
sce = SingleCellExperiment(assays = list(counts = counts.mx))
sce## class: SingleCellExperiment
## dim: 32111 14963
## metadata(0):
## assays(1): counts
## rownames(32111): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(0):
## reducedDimNames(0):
## spikeNames(0):
We will extract annotation information based on gene names
anno.file = file.path(data_dir, "gene.annoation_dronc.rds")
if(file.exists(anno.file)){
gene_anno = readRDS(anno.file)
}else{
ensembl = useEnsembl(biomart="ensembl",GRCh=37,
dataset="hsapiens_gene_ensembl")
attr_string = c("hgnc_symbol","external_gene_name","chromosome_name",
"start_position","end_position","strand","description",
"percentage_gene_gc_content","gene_biotype")
gene_anno = getBM(attributes = attr_string,
filters = "external_gene_name",
values = rownames(sce),
mart = ensembl)
saveRDS(gene_anno, file=anno.file)
}
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 34526 9
w2rm = which(!gene_anno$external_gene_name %in% rownames(sce))
w2rm## [1] 4232 4645
gene_anno[w2rm,]## hgnc_symbol external_gene_name chromosome_name start_position
## 4232 C10ORF68 10 32832297
## 4645 C1ORF220 1 178511950
## end_position strand
## 4232 33171802 1
## 4645 178517579 1
## description
## 4232 Homo sapiens coiled-coil domain containing 7 (CCDC7), transcript variant 5, mRNA. [Source:RefSeq mRNA;Acc:NM_024688]
## 4645
## percentage_gene_gc_content gene_biotype
## 4232 37.23 protein_coding
## 4645 49.91 protein_coding
gene_anno = gene_anno[-w2rm,]
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 34524 9
Many genes have multiple entries in annotation, often because they are annotatd to scaffolds, assembly patches and alternate loci. Here we simply remove such entries. The we remove duplicated annotations and genes without annotations.
table(gene_anno$chromosome_name)[1:30]##
## 1 10 11 12 13 14
## 3084 1252 1741 1694 637 1145
## 15 16 17 18 19 2
## 1169 1426 1820 633 1955 2186
## 20 21 22 3 4 5
## 800 383 704 1784 1301 1567
## 6 7 8 9 GL000192.1 GL000193.1
## 1622 1446 1296 1207 2 1
## GL000194.1 GL000195.1 GL000199.1 GL000204.1 GL000205.1 GL000213.1
## 2 1 1 1 2 1
chr.nms = c(1:22,"X","Y","MT")
gene_anno = gene_anno[which(gene_anno$chromosome_name %in% chr.nms),]
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 31978 9
t1 = table(gene_anno$external_gene_name)
t2 = sort(t1[t1 > 1], decreasing=TRUE)
length(t2)## [1] 40
t2[1:10]##
## SNORD113 MIR1302-2 NPIPA7 CCDC177 CDRT1
## 6 4 3 2 2
## CEBPA-AS1 FAM226B FAM47E-STBD1 GATS GOLGA7B
## 2 2 2 2 2
gene_anno[which(gene_anno$external_gene_name %in% names(t2)[1:4]), 1:4]## hgnc_symbol external_gene_name chromosome_name start_position
## 5013 CCDC177 CCDC177 14 70037483
## 5014 CCDC177 14 70038216
## 14832 MIR1302-11 MIR1302-2 19 71161
## 14833 MIR1302-10 MIR1302-2 19 71161
## 14834 MIR1302-9 MIR1302-2 19 71161
## 14835 MIR1302-2 MIR1302-2 19 71161
## 16502 NPIPA7 16 16411301
## 16503 NPIPA7 NPIPA7 16 16472912
## 16505 NPIPA7 16 18451943
## 29789 SNORD113 14 101422577
## 29810 SNORD113 14 101443726
## 29811 SNORD113 14 101445339
## 29812 SNORD113 14 101446329
## 29825 SNORD113 14 101460594
## 29826 SNORD113 14 101464804
w.duplicate = which(gene_anno$external_gene_name %in% names(t2))
ganno2 = gene_anno[w.duplicate,]
dim(ganno2)## [1] 87 9
table(ganno2$hgnc_symbol == ganno2$external_gene_name)##
## FALSE TRUE
## 46 41
ganno2 = ganno2[which(ganno2$hgnc_symbol == ganno2$external_gene_name),]
dim(ganno2)## [1] 41 9
ganno2 = dplyr::distinct(ganno2,external_gene_name,.keep_all = TRUE)
dim(ganno2)## [1] 39 9
gene_anno = rbind(gene_anno[-w.duplicate,], ganno2)
dim(gene_anno)## [1] 31930 9
table(gene_anno$gene_biotype)##
## 3prime_overlapping_ncrna antisense IG_C_gene
## 11 4152 3
## IG_V_gene lincRNA miRNA
## 6 4807 966
## misc_RNA Mt_rRNA Mt_tRNA
## 935 2 18
## processed_transcript protein_coding pseudogene
## 394 18296 3
## rRNA sense_intronic sense_overlapping
## 204 648 172
## snoRNA snRNA TR_C_gene
## 201 1083 5
## TR_J_gene TR_V_gene
## 7 17
gene_missing = setdiff(rownames(sce),gene_anno$external_gene_name)
gene_missing[1:10]## [1] "AC005152.2" "AC006132.1" "AC006445.8" "AC007092.1" "AC007390.5"
## [6] "AC007464.1" "AC009232.2" "AC009236.1" "AC010127.5" "AC011043.1"
length(gene_missing)## [1] 181
w2kp = match(gene_anno$external_gene_name,rownames(sce))
table(is.na(w2kp))##
## FALSE
## 31930
gene_anno$external_gene_name[which(is.na(w2kp))]## character(0)
sce = sce[w2kp,]
dim(sce)## [1] 31930 14963
rowData(sce) = gene_annoPlease refer to this workflow in bioconductor for reference.
# Calling cells from empty droplets
bcrank = barcodeRanks(counts(sce))
# Only show unique points for plotting speed.
uniq = !duplicated(bcrank$rank)
par(mar=c(5,4,2,1), bty="n")
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)
abline(h=bcrank$inflection, col="darkgreen", lty=2,lwd=2)
abline(h=bcrank$knee, col="dodgerblue", lty=2,lwd=2)
legend("left", legend=c("Inflection", "Knee"), bty="n",
col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2,lwd=2)bcrank$inflection## hCf_TAATAGCGGTAC
## 221
bcrank$knee## PFC2-A2_CACAGCGCTGTC
## 580
summary(bcrank$total)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 218.0 459.0 716.0 948.9 1167.0 9423.0
table(bcrank$total >= bcrank$knee)##
## FALSE TRUE
## 5513 9450
table(bcrank$total >= bcrank$inflection)##
## FALSE TRUE
## 1 14962
set.seed(100)
e.out = emptyDrops(counts(sce))
e.out## DataFrame with 14963 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## hHP1_AACACTATCTAC 4484 NaN 1 FALSE 0
## hHP1_CTACGCATCCAT 4919 NaN 1 FALSE 0
## hHP1_TCGGTACTAATA 5163 NaN 1 FALSE 0
## hHP1_CCCGCACGCTAT 5030 NaN 1 FALSE 0
## hHP1_TCATTTTGTCAT 3588 NaN 1 FALSE 0
## ... ... ... ... ... ...
## PFC-CD_TTGCCTGGCGGG 590 NaN 1 FALSE 0
## PFC-CD_CACGCTCCCCTA 269 NaN 1 FALSE 1
## PFC-CD_GCTCTACAACCG 522 NaN 1 FALSE 1
## PFC-CD_CTCCATTCATGC 497 NaN 1 FALSE 1
## PFC-CD_CGTCATTAGCAG 568 NaN 1 FALSE 1
length(unique(e.out$FDR))## [1] 2
table(e.out$FDR)##
## 0 1
## 9450 5513
tapply(e.out$Total, e.out$FDR, summary)## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 581 748 1011 1268 1487 9423
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 218.0 317.0 391.0 402.3 489.0 580.0
From the above analysis, some cells with very small number of UMI’s. Here we chose do not remove any cells because it appears all these 14,963 cells were used in the main anlaysis. Based on Figure 2 of their paper, there are
14,963 DroNc-seq nuclei profiles (each with >10,000 reads and >200 genes)
We will generate a set of QC features per cell, including the expression of mitochondira/ribosomal genes. We identify ribosomal genes based on annoation from https://www.genenames.org/.
file_link = "https://www.genenames.org/cgi-bin/genefamilies/set/1054/download/branch"
file_name = "ribosome_genes.txt"
ribo_fn = file.path(data_dir, file_name)
if( !file.exists(ribo_fn) ){
cmd = sprintf("cd %s; wget %s",data_dir, file_link)
print(cmd)
system(cmd)
}
ribo = read.table(ribo_fn, sep='\t', header=TRUE, stringsAsFactors = FALSE)
ribo[1:2,]## HGNC.ID Approved.Symbol Approved.Name Status Previous.Symbols
## 1 10298 RPL10 ribosomal protein L10 Approved
## 2 10299 RPL10A ribosomal protein L10a Approved NEDD6
## Synonyms Chromosome Accession.Numbers
## 1 NOV, QM, DXS648E, DXS648, FLJ23544, L10 Xq28 AB007170
## 2 Csa-19, L10A 6p21.31 U12404
## RefSeq.IDs Gene.Family.Tag Gene.family.description Gene.family.ID
## 1 NM_006013 RPL L ribosomal proteins 729
## 2 NM_007104 RPL L ribosomal proteins 729
is_mito = which(rowData(sce)$chromosome_name == "MT")
is_ribo = which(rowData(sce)$hgnc_symbol %in% ribo$Approved.Symbol)
length(is_mito)## [1] 33
length(is_ribo)## [1] 160
sce = calculateQCMetrics(sce, feature_controls=list(Mt=is_mito, Ri=is_ribo))
sort(colnames(colData(sce)))## [1] "is_cell_control"
## [2] "log10_total_counts"
## [3] "log10_total_counts_endogenous"
## [4] "log10_total_counts_feature_control"
## [5] "log10_total_counts_Mt"
## [6] "log10_total_counts_Ri"
## [7] "log10_total_features"
## [8] "log10_total_features_by_counts"
## [9] "log10_total_features_by_counts_endogenous"
## [10] "log10_total_features_by_counts_feature_control"
## [11] "log10_total_features_by_counts_Mt"
## [12] "log10_total_features_by_counts_Ri"
## [13] "log10_total_features_endogenous"
## [14] "log10_total_features_feature_control"
## [15] "log10_total_features_Mt"
## [16] "log10_total_features_Ri"
## [17] "pct_counts_endogenous"
## [18] "pct_counts_feature_control"
## [19] "pct_counts_in_top_100_features"
## [20] "pct_counts_in_top_100_features_endogenous"
## [21] "pct_counts_in_top_100_features_feature_control"
## [22] "pct_counts_in_top_100_features_Ri"
## [23] "pct_counts_in_top_200_features"
## [24] "pct_counts_in_top_200_features_endogenous"
## [25] "pct_counts_in_top_50_features"
## [26] "pct_counts_in_top_50_features_endogenous"
## [27] "pct_counts_in_top_50_features_feature_control"
## [28] "pct_counts_in_top_50_features_Ri"
## [29] "pct_counts_in_top_500_features"
## [30] "pct_counts_in_top_500_features_endogenous"
## [31] "pct_counts_Mt"
## [32] "pct_counts_Ri"
## [33] "pct_counts_top_100_features"
## [34] "pct_counts_top_100_features_endogenous"
## [35] "pct_counts_top_100_features_feature_control"
## [36] "pct_counts_top_100_features_Ri"
## [37] "pct_counts_top_200_features"
## [38] "pct_counts_top_200_features_endogenous"
## [39] "pct_counts_top_50_features"
## [40] "pct_counts_top_50_features_endogenous"
## [41] "pct_counts_top_50_features_feature_control"
## [42] "pct_counts_top_50_features_Ri"
## [43] "pct_counts_top_500_features"
## [44] "pct_counts_top_500_features_endogenous"
## [45] "total_counts"
## [46] "total_counts_endogenous"
## [47] "total_counts_feature_control"
## [48] "total_counts_Mt"
## [49] "total_counts_Ri"
## [50] "total_features"
## [51] "total_features_by_counts"
## [52] "total_features_by_counts_endogenous"
## [53] "total_features_by_counts_feature_control"
## [54] "total_features_by_counts_Mt"
## [55] "total_features_by_counts_Ri"
## [56] "total_features_endogenous"
## [57] "total_features_feature_control"
## [58] "total_features_Mt"
## [59] "total_features_Ri"
par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
hist(log10(sce$total_counts), xlab="log10(Library sizes)", main="",
breaks=20, col="grey80", ylab="Number of cells")
hist(log10(sce$total_features), xlab="log10(# of expressed genes)",
main="", breaks=20, col="grey80", ylab="Number of cells")
hist(sce$pct_counts_Ri, xlab="Ribosome prop. (%)",
ylab="Number of cells", breaks=40, main="", col="grey80")
hist(sce$pct_counts_Mt, xlab="Mitochondrial prop. (%)",
ylab="Number of cells", breaks=80, main="", col="grey80")par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
plot(log10(sce$total_counts), log10(sce$total_features),
xlab="log10(Library sizes)", ylab="log10(# of expressed genes)",
pch=16, col=rgb(0,0,0,0.4))
plot(log10(sce$total_counts), sce$pct_counts_Ri, pch=16, col=rgb(0,0,0,0.4),
xlab="log10(Library sizes)", ylab="Ribosome prop. (%)")
plot(log10(sce$total_counts), sce$pct_counts_Mt, pch=16, col=rgb(0,0,0,0.4),
xlab="log10(Library sizes)", ylab="Mitochondrial prop. (%)")
plot(x=sce$pct_counts_Ri, y=sce$pct_counts_Mt, pch=16, col=rgb(0,0,0,0.4),
xlab="Ribosome prop. (%)", ylab="Mitochondrial prop. (%)")rowData(sce)[1:2,]## DataFrame with 2 rows and 20 columns
## hgnc_symbol external_gene_name chromosome_name start_position
## <character> <character> <character> <integer>
## 1 AC006946.16 22 17548711
## 2 AC006946.17 22 17561591
## end_position strand description percentage_gene_gc_content
## <integer> <integer> <character> <numeric>
## 1 17551565 1 41.61
## 2 17562346 1 41.67
## gene_biotype is_feature_control is_feature_control_Mt
## <character> <logical> <logical>
## 1 lincRNA FALSE FALSE
## 2 lincRNA FALSE FALSE
## is_feature_control_Ri mean_counts log10_mean_counts
## <logical> <numeric> <numeric>
## 1 FALSE 0.00548018445498897 0.0023735161394708
## 2 FALSE 0.00360890195816347 0.00156450482886486
## n_cells_by_counts pct_dropout_by_counts total_counts log10_total_counts
## <integer> <numeric> <integer> <numeric>
## 1 79 99.4720310098242 82 1.91907809237607
## 2 52 99.6524761077324 54 1.74036268949424
## n_cells_counts pct_dropout_counts
## <integer> <numeric>
## 1 79 99.4720310098242
## 2 52 99.6524761077324
min(rowData(sce)$mean_counts)## [1] 6.683152e-05
min(rowData(sce)$mean_counts[rowData(sce)$mean_counts>0])## [1] 6.683152e-05
min(rowData(sce)$n_cells_counts)## [1] 1
par(mfrow=c(1,3), mar=c(5,4,1,1))
hist(log10(rowData(sce)$mean_counts+1e-6), col="grey80", main="",
breaks=40, xlab="log10(ave # of UMI + 1e-6)")
hist(log10(rowData(sce)$n_cells_counts+1), col="grey80", main="",
breaks=40, xlab="log10(# of expressed cells + 1)")
plot(log10(rowData(sce)$mean_counts+1e-6), pch=16, col=rgb(0,0,0,0.4),
log10(rowData(sce)$n_cells_counts + 1),
xlab="log10(ave # of UMI + 1e-6)",
ylab="log10(# of expressed cells + 1)")tb1 = table(rowData(sce)$n_cells_counts)
tb1[1:11]##
## 1 2 3 4 5 6 7 8 9 10 11
## 3302 1958 1280 1029 843 667 552 525 473 383 369
We remove those genes that are lowly expressed. (Habib et al. 2017) mentioned in section “Gene detection and quality controls”" of Online Methods
Nuclei with less than 200 detected genes and less than 10,000 usable reads were filtered out.
and
A gene is considered detected in a cell if it has at least two unique UMIs (transcripts) associated with it. For each analysis, genes were removed that were detected in less than 10 nuclei.
Therefore it seems we should remove all the cells having less than 200 genes with two or more UMI counts. However, this would remove majorify of the cells. Therefore, we conlcude that they meant to remove the cells having less than 200 genes with one or more UMI counts. To filter genes, we follow their threshold to remove genes with two or more UMIs in less than 10 nuclei.
Note taht the variable strand need to be renamed, otherwise there is an error message that such a variabel name cannot be used.
names(rowData(sce))[6] = "strand_n"
n.genes = colSums(counts(sce) >= 2)
summary(n.genes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.0 42.0 71.0 112.1 132.0 1622.0
table(n.genes >= 100)##
## FALSE TRUE
## 9635 5328
table(n.genes >= 200)##
## FALSE TRUE
## 12902 2061
n.genes = colSums(counts(sce) >= 1)
summary(n.genes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 198.0 336.0 528.0 656.8 824.0 4233.0
table(n.genes >= 100)##
## TRUE
## 14963
table(n.genes >= 200)##
## FALSE TRUE
## 3 14960
n.cells = rowSums(counts(sce) >= 2)
summary(n.cells)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 2.00 52.53 26.00 13369.00
table(n.cells >= 10)##
## FALSE TRUE
## 20649 11281
sce = sce[which(n.cells >= 10),]
dim(sce)## [1] 11281 14963
Next we check those highly expressed genes
par(mar=c(5,4,1,1))
od1 = order(rowData(sce)$mean_counts, decreasing = TRUE)
barplot(rowData(sce)$mean_counts[od1[20:1]], las=1,
names.arg=rowData(sce)$hgnc_symbol[od1[20:1]],
horiz=TRUE, cex.names=0.8, cex.axis=0.8,
xlab="ave # of UMI")A simple solution for normalization and stablizing expression varaince across genes is to tranform the count data by log(count/size.factor + 1). One may calcualte size.factor per cell as the total number of UMIs, and this assumes the total expression are the same across all the cells. However, the total expression of each cell may vary with respect to cell type and/or cell size, and the computeSumFactors function in R package scran provides a more sophisicated way to calcualte size.factor to allow such variaation across cells (Lun, Bach, and Marioni 2016). computeSumFactors can use initial clustering of cells to normalize expression within and beetween clusters. Within a cluster, it estimates the size factor for many groups of cells so that there are more groups than cells, and then it can calcualte the size factor per cell using a lienar deconvolution system.
As shown in the following plot, the final size factor estimation is indeed highly correlated with the naive definition by total count.
Finally, the command normalize(sce) adds the normalized expression into the variable sce, which can be accessed by logcounts(sce) = log2(gene_cell_count / size_factor + 1).
date()## [1] "Thu Oct 4 23:16:57 2018"
clusters = quickCluster(sce, min.mean=0.1, method="igraph")
table(clusters)## clusters
## 1 2 3 4 5
## 2574 1422 5728 1993 3246
date()## [1] "Thu Oct 4 23:17:49 2018"
sce = computeSumFactors(sce, cluster=clusters, min.mean=0.1)
date()## [1] "Thu Oct 4 23:20:07 2018"
summary(sizeFactors(sce))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.0807 0.4197 0.7436 0.9992 1.2504 11.0729
sort(sizeFactors(sce))[1:30]## [1] -0.0807031952 -0.0760340652 -0.0502627636 -0.0497516071 -0.0333388762
## [6] -0.0278556079 -0.0250192491 -0.0107172974 -0.0042912613 -0.0011366402
## [11] -0.0008049468 0.0062927996 0.0119193118 0.0290307001 0.0343657339
## [16] 0.0521217305 0.0683599087 0.0695881927 0.0711287055 0.0768475650
## [21] 0.0783454542 0.0791481088 0.0796070558 0.0813334824 0.0824948106
## [26] 0.0852443823 0.0878596351 0.0908209323 0.0938742916 0.0964200771
# Remove cells with negative or very small size factors
dim(sce)## [1] 11281 14963
sce = sce[,which(sizeFactors(sce) > 0.01)]
dim(sce)## [1] 11281 14951
par(mfrow=c(1,2), mar=c(5,4,2,1), bty="n")
smoothScatter(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors")
plot(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors",
cex=0.3, pch=20, col=rgb(0.1,0.2,0.7,0.3))
abline(h=0.05)dim(sce)## [1] 11281 14951
sce = sce[,which(sizeFactors(sce) > 0.05)]
dim(sce)## [1] 11281 14948
sce = normalize(sce) For dimension reduction, such as calculating PCA or performing TSNE, we should start by identifying a subset of genes with high level of biological signal relative to background (technical) noise. The decomposeVar function from R/cran is designed for this task.
new.trend = makeTechTrend(x=sce)
fit = trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))
par(mfrow=c(1,1), mar=c(5,4,2,1), bty="n")
plot(fit$mean, fit$var, pch=20, col=rgb(0.1,0.2,0.7,0.6),
xlab="log(mean)", ylab="var")
curve(fit$trend(x), col="orange", lwd=2, add=TRUE)
curve(new.trend(x), col="red", lwd=2, add=TRUE)
legend("top", legend=c("Poisson noise", "observed trend"),
lty=1, lwd=2, col=c("red", "orange"), bty="n")fit$trend = new.trend
# obtain bio, FDR, pvalues for testing for HVGs (highly variable genes)
dec = decomposeVar(fit=fit)
top.dec = dec[order(dec$bio, decreasing=TRUE),]
plotExpression(sce, features=rownames(top.dec)[1:10])When performing PCA, we can use all the genes or just those genes with high signal-to-noise ratio. TSNE analysis is usually based on the top PCs rather than the original gene expression data. We select around top 1000 genes for the PCA and use the top 50 PCs for TSNE projection.
library(svd)
library(Rtsne)
summary(dec$bio)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.022172 0.000947 0.004376 0.011828 0.010775 5.132832
dec1 = dec
dec1$bio[which(dec$bio < 1e-5)] = 1e-5
dec1$FDR[which(dec$FDR < 1e-100)] = 1e-100
par(mfrow=c(1,2))
hist(log10(dec1$bio), breaks=100, main="")
hist(log10(dec1$FDR), breaks=100, main="")summary(dec$FDR[dec$bio > 0.01])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 6.192e-06 0.000e+00 6.215e-03
summary(dec$FDR[dec$bio > 0.03])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 3.626e-11 0.000e+00 3.310e-08
table(dec$FDR < 1e-10, dec$bio > 0.03)##
## FALSE TRUE
## FALSE 4348 2
## TRUE 6011 920
# Subsetting genes based on FDR and biological residual thresholds
w2kp = which(dec$FDR < 1e-10 & dec$bio > 0.03) # original gene filter
sce_sub = sce[w2kp,]
sce_sub## class: SingleCellExperiment
## dim: 920 14948
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(920): AATK ABCA8 ... ZNF644 ZSWIM6
## rowData names(20): hgnc_symbol external_gene_name ...
## n_cells_counts pct_dropout_counts
## colnames(14948): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(59): is_cell_control total_features_by_counts ...
## pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(0):
## spikeNames(0):
# Extracting log2(norm_express+1)
edat = t(as.matrix(logcounts(sce_sub)))
edat = scale(edat)
dim(edat)## [1] 14948 920
edat[1:2,1:3]## AATK ABCA8 A2M
## hHP1_AACACTATCTAC -0.3209666 -0.2109691 -0.1499949
## hHP1_CTACGCATCCAT 0.2404263 -0.2109691 -0.1499949
# Perform SVD on sce_sub
date()## [1] "Thu Oct 4 23:20:52 2018"
ppk = propack.svd(edat,neig=50)
date()## [1] "Thu Oct 4 23:21:02 2018"
pca = t(ppk$d*t(ppk$u)) # calculates pc scores aka principal components
df_pcs = data.frame(pca)
df_pcs$log10_total_features = colData(sce_sub)$log10_total_features
df_pcs$part_cell_id = sapply(colnames(sce_sub),function(xx)
strsplit(xx,"_")[[1]][1],USE.NAMES=FALSE)
df_pcs[1:2,1:10]## X1 X2 X3 X4 X5 X6 X7
## 1 -0.6397168 3.029322 2.126551 0.2110893 -1.250563 -0.8411983 0.3393688
## 2 0.1510047 3.194259 1.433259 1.3759974 -1.516416 -1.5550675 0.2770684
## X8 X9 X10
## 1 0.3637856 -0.5986222 1.7335946
## 2 1.6677843 0.2782422 0.3932075
gp1 = ggplot(df_pcs, aes(X1,X2,col=log10_total_features)) +
geom_point(size=0.5,alpha=0.6) + theme_classic() +
scale_colour_gradient(low="lightblue",high="red") +
guides(color = guide_legend(override.aes = list(size=3)))
gp1gp2 = ggplot(df_pcs, aes(X1,X2,col=part_cell_id)) +
geom_point(size=0.5,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3)))
gp2set.seed(100)
date()## [1] "Thu Oct 4 23:21:04 2018"
tsne = Rtsne(pca, pca = FALSE)
date()## [1] "Thu Oct 4 23:23:47 2018"
df_tsne = data.frame(tsne$Y)
df_tsne$log10_total_features = colData(sce_sub)$log10_total_features
df_tsne$part_cell_id = sapply(colnames(sce_sub),function(xx)
strsplit(xx,"_")[[1]][1],USE.NAMES=FALSE)
dim(df_tsne)## [1] 14948 4
df_tsne[1:2,]## X1 X2 log10_total_features part_cell_id
## 1 -2.571519 8.399695 3.393048 hHP1
## 2 -2.742694 5.133562 3.453624 hHP1
gp1 = ggplot(df_tsne, aes(X1,X2,col=log10_total_features)) +
geom_point(size=0.3,alpha=0.6) + theme_classic() +
scale_colour_gradient(low="lightblue",high="red") +
guides(color = guide_legend(override.aes = list(size=3)))
gp1gp2 = ggplot(df_tsne, aes(X1,X2,col=part_cell_id)) +
geom_point(size=0.3,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3)))
gp2reducedDims(sce_sub) = SimpleList(PCA=pca, TSNE=tsne$Y)
sce_sub## class: SingleCellExperiment
## dim: 920 14948
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(920): AATK ABCA8 ... ZNF644 ZSWIM6
## rowData names(20): hgnc_symbol external_gene_name ...
## n_cells_counts pct_dropout_counts
## colnames(14948): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(59): is_cell_control total_features_by_counts ...
## pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
Next we cluster the cells using a simple kmeans method on the top 50 PCs. We set the number of clusters to be 12, to match with the 12 cell types reported by (Habib et al. 2017).
k10_50_pcs = kmeans(reducedDim(sce_sub, "PCA"),
centers = 12, iter.max=150, algorithm="MacQueen")
names(k10_50_pcs)## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
dim(k10_50_pcs$centers)## [1] 12 50
df_tsne$cluster_kmean = as.factor(k10_50_pcs$cluster)
gp1 = ggplot(df_tsne, aes(X1,X2,col=cluster_kmean)) +
geom_point(size=0.3,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3)))
gp1Next seek to cluster cells using another method SC3. The code used here is based on SC3 mannual. By default, when there are more than 5000 genes, SC3 will
selects a subset of cells uniformly at random (5,000), and obtains clusters from this subset. The inferred labels can be used to train a Support Vector Machine (SVM), which is employed to assign labels to the remaining cells.
By default, SC3 filter genes to select those with dropout percentage between 10 and 90%.
summary(rowData(sce)$pct_dropout_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.34 93.42 96.32 94.53 97.95 99.79
table(rowData(sce)$pct_dropout_counts < 90)##
## FALSE TRUE
## 9863 1418
This will end up with 1418 genes. However, we found the clustering resutls using these 1418 genes have considerable discrepency with clustering resutls from Kmeans and cell types reported by (Habib et al. 2017). Therefore, we chose to use those genes identified from previous step for SC3.
library(SC3)
rowData(sce_sub)$feature_symbol = rowData(sce_sub)$external_gene_name
date()## [1] "Thu Oct 4 23:23:54 2018"
all_ks = c(10,12)
sce_sub = sc3(sce_sub, gene_filter=FALSE, ks = all_ks, biology = TRUE,
rand_seed = 100)date()## [1] "Fri Oct 5 00:17:53 2018"
dim(colData(sce_sub))## [1] 14948 63
colData(sce_sub)[1:2,1:5]## DataFrame with 2 rows and 5 columns
## is_cell_control total_features_by_counts
## <logical> <integer>
## hHP1_AACACTATCTAC FALSE 2471
## hHP1_CTACGCATCCAT FALSE 2841
## log10_total_features_by_counts total_counts
## <numeric> <integer>
## hHP1_AACACTATCTAC 3.39304846641678 4484
## hHP1_CTACGCATCCAT 3.45362407359145 4919
## log10_total_counts
## <numeric>
## hHP1_AACACTATCTAC 3.65176244738011
## hHP1_CTACGCATCCAT 3.69196510276736
names(colData(sce_sub))## [1] "is_cell_control"
## [2] "total_features_by_counts"
## [3] "log10_total_features_by_counts"
## [4] "total_counts"
## [5] "log10_total_counts"
## [6] "pct_counts_in_top_50_features"
## [7] "pct_counts_in_top_100_features"
## [8] "pct_counts_in_top_200_features"
## [9] "pct_counts_in_top_500_features"
## [10] "total_features"
## [11] "log10_total_features"
## [12] "pct_counts_top_50_features"
## [13] "pct_counts_top_100_features"
## [14] "pct_counts_top_200_features"
## [15] "pct_counts_top_500_features"
## [16] "total_features_by_counts_endogenous"
## [17] "log10_total_features_by_counts_endogenous"
## [18] "total_counts_endogenous"
## [19] "log10_total_counts_endogenous"
## [20] "pct_counts_endogenous"
## [21] "pct_counts_in_top_50_features_endogenous"
## [22] "pct_counts_in_top_100_features_endogenous"
## [23] "pct_counts_in_top_200_features_endogenous"
## [24] "pct_counts_in_top_500_features_endogenous"
## [25] "total_features_endogenous"
## [26] "log10_total_features_endogenous"
## [27] "pct_counts_top_50_features_endogenous"
## [28] "pct_counts_top_100_features_endogenous"
## [29] "pct_counts_top_200_features_endogenous"
## [30] "pct_counts_top_500_features_endogenous"
## [31] "total_features_by_counts_feature_control"
## [32] "log10_total_features_by_counts_feature_control"
## [33] "total_counts_feature_control"
## [34] "log10_total_counts_feature_control"
## [35] "pct_counts_feature_control"
## [36] "pct_counts_in_top_50_features_feature_control"
## [37] "pct_counts_in_top_100_features_feature_control"
## [38] "total_features_feature_control"
## [39] "log10_total_features_feature_control"
## [40] "pct_counts_top_50_features_feature_control"
## [41] "pct_counts_top_100_features_feature_control"
## [42] "total_features_by_counts_Mt"
## [43] "log10_total_features_by_counts_Mt"
## [44] "total_counts_Mt"
## [45] "log10_total_counts_Mt"
## [46] "pct_counts_Mt"
## [47] "total_features_Mt"
## [48] "log10_total_features_Mt"
## [49] "total_features_by_counts_Ri"
## [50] "log10_total_features_by_counts_Ri"
## [51] "total_counts_Ri"
## [52] "log10_total_counts_Ri"
## [53] "pct_counts_Ri"
## [54] "pct_counts_in_top_50_features_Ri"
## [55] "pct_counts_in_top_100_features_Ri"
## [56] "total_features_Ri"
## [57] "log10_total_features_Ri"
## [58] "pct_counts_top_50_features_Ri"
## [59] "pct_counts_top_100_features_Ri"
## [60] "sc3_10_clusters"
## [61] "sc3_12_clusters"
## [62] "sc3_10_log2_outlier_score"
## [63] "sc3_12_log2_outlier_score"
table(colData(sce_sub)$sc3_12_clusters, useNA="ifany")##
## 1 2 3 4 5 6 7 8 9 10 11 12 <NA>
## 644 678 206 703 1043 292 263 43 597 27 402 102 9948
table(colData(sce_sub)$sc3_10_clusters, useNA="ifany")##
## 1 2 3 4 5 6 7 8 9 10 <NA>
## 1394 241 605 499 21 4 1054 643 73 466 9948
# Run SVM and predict labels of all other cells
date()## [1] "Fri Oct 5 00:17:53 2018"
sce_sub = sc3_run_svm(sce_sub, ks = all_ks)
date()## [1] "Fri Oct 5 00:20:14 2018"
table(colData(sce_sub)$sc3_12_clusters, useNA="ifany")##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 1938 687 988 879 2358 742 1590 2322 1209 517 432 1286
table(colData(sce_sub)$sc3_10_clusters, useNA="ifany")##
## 1 2 3 4 5 6 7 8 9 10
## 4376 1179 1034 1683 980 13 1055 2714 1340 574
sum(table(colData(sce_sub)$sc3_12_clusters))## [1] 14948
sum(table(colData(sce_sub)$sc3_10_clusters))## [1] 14948
We obtainted the cell type and clustering resutls from (Habib et al. 2017). Supplementary Table 10: supplement nmeth.4407-S10.xlsx file. Here we compare the cell type reported by Habib et al. (2017) with ours. Habib et al. (2017) identified genes with high variation by fitting a gamma distribution on the relation between mean and coefficient of variation and chose the number of PCs based on “the largest eigen value gap”. It was not clear what is the number of PCs used. Then they used these top PCs to perform tSNE anlaysis and clustering using a graph based method.
source(file.path(src_dir,"SOURCE.R"))
tmp_lab = smart_RT(file.path(src_dir, "cluster_num_label.txt"),
sep = "\t",header = TRUE)
tmp_lab = name_change(tmp_lab, "Name", "Cluster.Name")
tmp_lab = name_change(tmp_lab, "Name.1", "Cell_Type")
dim(tmp_lab)## [1] 15 4
tmp_lab[1:2,]## Cell.ID Cluster.Name Cell.Type.ID Cell_Type
## 1 1 exPFC1 1 exPFC
## 2 2 exPFC2 1 exPFC
tmp_res = smart_RT(file.path(src_dir, "paper_cluster_res.txt"),
sep = "\t", header = TRUE, comment.char = "")
tmp_res = name_change(tmp_res, "X.Genes", "nGenes")
tmp_res = name_change(tmp_res, "X.Transcripts", "nTranscripts")
tmp_res = smart_merge(tmp_res, tmp_lab[,c("Cluster.Name","Cell_Type")],
all.x=TRUE)
dim(tmp_res)## [1] 14963 6
tmp_res[1:2,]## Cluster.Name Cell.ID nGenes nTranscripts Cluster.ID Cell_Type
## 1 ASC1 hCd_TCCTTCCGTAAA 334 515 8 ASC
## 2 ASC1 hCd_ACGCCCAGGGCG 465 608 8 ASC
summary(tmp_res$nGenes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 201.0 337.0 529.0 658.4 826.0 4249.0
hist(tmp_res$nGenes, breaks=50, col="gray", main="",
xlab="number of genes per cell")df_tsne$Cell.ID = colnames(sce_sub)
# Merge and compare
all_clust_res = smart_merge(df_tsne, tmp_res)
sc3_res = smart_df(colData(sce_sub)[,c("sc3_12_clusters", "sc3_10_clusters")])
sc3_res$Cell.ID = colnames(sce_sub)
all_clust_res = smart_merge(all_clust_res, sc3_res)
dim(all_clust_res)## [1] 14948 13
all_clust_res[1:5,]## Cell.ID X1 X2 log10_total_features part_cell_id
## 1 hCc_AAAAAGCTACAA 10.451712 -10.5185553 2.866287 hCc
## 2 hCc_AAACAGGTGAGG 25.893491 1.3019993 3.170262 hCc
## 3 hCc_AAACCCTTTACA 23.711009 -0.9235069 3.044540 hCc
## 4 hCc_AAACGTGACGGA 20.799815 -2.1608760 2.849419 hCc
## 5 hCc_AAAGAACTCGCG 2.945587 -3.9327487 2.716838 hCc
## cluster_kmean Cluster.Name nGenes nTranscripts Cluster.ID Cell_Type
## 1 10 exPFC1 736 1034 1 exPFC
## 2 3 GABA1 1480 2391 5 GABA
## 3 10 GABA1 1107 1757 5 GABA
## 4 10 GABA1 708 935 5 GABA
## 5 10 exPFC1 520 673 1 exPFC
## sc3_12_clusters sc3_10_clusters
## 1 5 1
## 2 8 1
## 3 5 1
## 4 8 1
## 5 5 1
smart_table(all_clust_res[,c("Cell_Type","Cluster.ID")])## Cluster.ID
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 0 0 0 0 0 0 0 1204 705 0 0 0 0
## END 0 0 0 0 0 0 0 0 0 0 0 0 0
## exCA1 0 0 421 0 0 0 0 0 0 0 0 0 0
## exCA3 0 0 0 745 0 0 0 0 0 0 0 0 0
## exDG 0 0 0 0 0 0 1453 0 0 0 0 0 0
## exPFC 3104 297 0 0 0 0 0 0 0 0 0 0 0
## GABA 0 0 0 0 892 823 0 0 0 0 0 0 0
## MG 0 0 0 0 0 0 0 0 0 0 0 0 389
## NSC 0 0 0 0 0 0 0 0 0 0 0 0 0
## ODC 0 0 0 0 0 0 0 0 0 1718 1247 0 0
## OPC 0 0 0 0 0 0 0 0 0 0 0 684 0
## <NA> 0 0 0 0 0 0 0 0 0 0 0 0 0
## Cluster.ID
## Cell_Type 14 15 16 17 18
## ASC 0 0 0 0 0
## END 0 254 0 0 0
## exCA1 0 0 0 0 0
## exCA3 0 0 0 0 0
## exDG 0 0 0 0 0
## exPFC 0 0 0 0 0
## GABA 0 0 0 0 0
## MG 0 0 0 0 0
## NSC 201 0 0 0 0
## ODC 0 0 0 0 0
## OPC 0 0 0 0 0
## <NA> 0 0 200 123 488
smart_table(all_clust_res[,c("Cell_Type","cluster_kmean")])## cluster_kmean
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 3 5 6 1 27 1790 3 28 0 35 11 0
## END 0 0 0 237 3 3 1 1 0 9 0 0
## exCA1 0 2 5 0 10 6 20 71 0 88 219 0
## exCA3 0 1 4 0 4 0 4 545 0 94 93 0
## exDG 0 0 0 0 3 1 2 86 0 16 1345 0
## exPFC 6 9 16 5 67 51 9 378 0 2817 43 0
## GABA 1 1 1045 1 3 2 47 99 0 505 11 0
## MG 3 6 3 279 25 8 1 11 0 39 14 0
## NSC 0 0 0 0 0 8 0 1 180 0 12 0
## ODC 6 1093 3 0 1814 6 1 6 0 24 12 0
## OPC 631 4 0 0 6 3 1 15 0 20 4 0
## <NA> 5 11 13 10 42 15 266 20 1 14 266 148
smart_table(all_clust_res[,c("Cell_Type","sc3_12_clusters")])## sc3_12_clusters
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 1808 4 6 10 13 3 14 23 10 3 6 9
## END 3 1 0 81 3 0 155 3 3 3 0 2
## exCA1 7 3 6 4 32 86 8 82 158 5 12 18
## exCA3 1 1 5 0 55 181 2 95 395 4 3 3
## exDG 1 0 1 0 9 4 1 22 472 1 4 938
## exPFC 56 21 582 21 750 11 26 1522 44 12 323 33
## GABA 4 2 59 154 205 5 232 491 5 446 23 89
## MG 11 4 3 117 13 2 212 12 6 6 0 3
## NSC 8 0 0 74 0 0 113 0 1 2 0 3
## ODC 10 630 7 347 1238 4 693 15 3 6 4 8
## OPC 6 5 201 1 8 432 3 13 2 9 1 3
## <NA> 23 16 118 70 32 14 131 44 110 20 56 177
smart_table(all_clust_res[,c("Cell_Type","sc3_10_clusters")])## sc3_10_clusters
## Cell_Type 1 2 3 4 5 6 7 8 9 10
## ASC 37 15 11 15 9 3 12 628 1179 0
## END 5 151 1 1 4 1 1 5 4 81
## exCA1 345 5 17 24 11 1 5 7 6 0
## exCA3 711 4 11 11 5 0 1 2 0 0
## exDG 56 7 466 918 1 0 0 4 1 0
## exPFC 2268 26 30 354 587 1 32 57 40 6
## GABA 827 187 355 114 221 1 1 4 5 0
## MG 14 205 8 2 6 1 8 24 11 110
## NSC 1 113 2 3 2 0 0 0 5 75
## ODC 14 2 8 10 12 3 972 1940 3 1
## OPC 14 430 7 4 6 1 2 9 14 197
## <NA> 84 34 118 227 116 1 21 34 72 104
t2 = melt(smart_table(all_clust_res[,c("Cell_Type","cluster_kmean")]))
colnames(t2)[2] = "cluster"
summary(t2$value)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 4.0 103.8 20.0 2817.0
gg.heatmap(t2, "kmeans 12 clusters")t2 = melt(smart_table(all_clust_res[,c("Cell_Type","sc3_12_clusters")]))
colnames(t2)[2] = "cluster"
summary(t2$value)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.00 8.50 103.81 61.75 1808.00
gg.heatmap(t2, "sc3 12 clusters")t2 = melt(smart_table(all_clust_res[,c("Cell_Type","sc3_10_clusters")]))
colnames(t2)[2] = "cluster"
summary(t2$value)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 2.00 8.00 124.57 60.75 2268.00
gg.heatmap(t2, "sc3 10 clusters")We plot our TSNE colored by our clustering results.
gp1 = ggplot(all_clust_res, aes(X1,X2,col=Cell_Type)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("cell type")
gp1gp1 = ggplot(all_clust_res, aes(X1,X2,col=cluster_kmean)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("kmeans 12 clusters")
gp1gp1 = ggplot(all_clust_res, aes(X1,X2,col=factor(sc3_10_clusters))) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("sc3 10 clusters")
gp1gp1 = ggplot(all_clust_res, aes(X1,X2,col=factor(sc3_12_clusters))) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("sc3 12 clusters")
gp1Next we plot the TSNE by Habib et al. (2017). and colored by our clustering results.
paper_tsne = smart_RT(file.path(dronc_dir, "GTEx_droncseq_hip_pcf.tsne.txt"),
sep='\t',header=TRUE,row.names=1)
paper_tsne$Cell.ID = rownames(paper_tsne)
rownames(paper_tsne) = NULL
all_clust_res = smart_merge(all_clust_res,paper_tsne)
gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=Cell_Type)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("cell types")
gp1gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=cluster_kmean)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("kmeans 12 clusters")
gp1gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=factor(sc3_10_clusters))) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("sc3 10 clusters")
gp1gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=factor(sc3_12_clusters))) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle("sc3 12 clusters")
gp1Finally we save the sce object and the clustering results
sce_fn = file.path(dronc_dir, "sce.rds")
saveRDS(sce, sce_fn)
sce_sub_fn = file.path(dronc_dir, "sce_sub.rds")
saveRDS(sce_sub, sce_sub_fn)
all_clust_res_fn = file.path(dronc_dir, "all_clust_res.rds")
saveRDS(all_clust_res, all_clust_res_fn)sessionInfo()## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] SC3_1.8.0 Rtsne_0.13
## [3] svd_0.4.1 limma_3.36.5
## [5] scran_1.8.4 scater_1.8.4
## [7] ggplot2_3.0.0 biomaRt_2.36.1
## [9] DropletUtils_1.0.3 SingleCellExperiment_1.2.0
## [11] SummarizedExperiment_1.10.1 DelayedArray_0.6.4
## [13] BiocParallel_1.14.2 matrixStats_0.54.0
## [15] Biobase_2.40.0 GenomicRanges_1.32.6
## [17] GenomeInfoDb_1.16.0 IRanges_2.14.10
## [19] S4Vectors_0.18.3 BiocGenerics_0.26.0
## [21] data.table_1.11.4
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.6.0 colorspace_1.3-2
## [3] rjson_0.2.20 class_7.3-14
## [5] dynamicTreeCut_1.63-1 rprojroot_1.3-2
## [7] XVector_0.20.0 DT_0.4
## [9] bit64_0.9-7 mvtnorm_1.0-8
## [11] AnnotationDbi_1.42.1 codetools_0.2-15
## [13] tximport_1.8.0 doParallel_1.0.11
## [15] robustbase_0.93-2 knitr_1.20
## [17] cluster_2.0.7-1 pheatmap_1.0.10
## [19] shinydashboard_0.7.0 shiny_1.1.0
## [21] rrcov_1.4-4 compiler_3.5.0
## [23] httr_1.3.1 backports_1.1.2
## [25] assertthat_0.2.0 Matrix_1.2-14
## [27] lazyeval_0.2.1 later_0.7.3
## [29] htmltools_0.3.6 prettyunits_1.0.2
## [31] tools_3.5.0 bindrcpp_0.2.2
## [33] igraph_1.2.2 gtable_0.2.0
## [35] glue_1.3.0 GenomeInfoDbData_1.1.0
## [37] reshape2_1.4.3 dplyr_0.7.6
## [39] doRNG_1.7.1 Rcpp_0.12.18
## [41] gdata_2.18.0 iterators_1.0.10
## [43] DelayedMatrixStats_1.2.0 stringr_1.3.1
## [45] mime_0.5 irlba_2.3.2
## [47] rngtools_1.3.1 gtools_3.8.1
## [49] WriteXLS_4.0.0 statmod_1.4.30
## [51] XML_3.98-1.15 DEoptimR_1.0-8
## [53] edgeR_3.22.3 zlibbioc_1.26.0
## [55] scales_1.0.0 hms_0.4.2
## [57] promises_1.0.1 rhdf5_2.24.0
## [59] RColorBrewer_1.1-2 yaml_2.2.0
## [61] memoise_1.1.0 gridExtra_2.3
## [63] pkgmaker_0.27 stringi_1.2.4
## [65] RSQLite_2.1.1 pcaPP_1.9-73
## [67] foreach_1.4.4 e1071_1.7-0
## [69] caTools_1.17.1.1 bibtex_0.4.2
## [71] rlang_0.2.1 pkgconfig_2.0.1
## [73] bitops_1.0-6 evaluate_0.11
## [75] lattice_0.20-35 ROCR_1.0-7
## [77] purrr_0.2.5 Rhdf5lib_1.2.1
## [79] bindr_0.1.1 htmlwidgets_1.2
## [81] labeling_0.3 cowplot_0.9.3
## [83] bit_1.1-14 tidyselect_0.2.4
## [85] plyr_1.8.4 magrittr_1.5
## [87] R6_2.2.2 gplots_3.0.1
## [89] DBI_1.0.0 pillar_1.3.0
## [91] withr_2.1.2 RCurl_1.95-4.11
## [93] tibble_1.4.2 crayon_1.3.4
## [95] KernSmooth_2.23-15 rmarkdown_1.10
## [97] viridis_0.5.1 progress_1.2.0
## [99] locfit_1.5-9.1 grid_3.5.0
## [101] blob_1.1.1 FNN_1.1.2.1
## [103] digest_0.6.15 xtable_1.8-2
## [105] httpuv_1.4.5 munsell_0.5.0
## [107] registry_0.5 beeswarm_0.2.3
## [109] viridisLite_0.3.0 vipor_0.4.5
Habib, Naomi, Inbal Avraham-Davidi, Anindita Basu, Tyler Burks, Karthik Shekhar, Matan Hofree, Sourav R Choudhury, et al. 2017. “Massively Parallel Single-Nucleus Rna-Seq with Dronc-Seq.” Nature Methods 14 (10). Nature Publishing Group: 955.
Lun, Aaron TL, Karsten Bach, and John C Marioni. 2016. “Pooling Across Cells to Normalize Single-Cell Rna Sequencing Data with Many Zero Counts.” Genome Biology 17 (1). BioMed Central: 75.